# The package "ezids" (EZ Intro to Data Science) includes a lot of the helper functions we developed for the course. 
# Importing the necessary libraries
library(ezids)
library(dplyr)
library(ggplot2)
library(DT)
library(corrplot)
library(lubridate)
library(tidyr)
library(scales)
library(cluster)
library(knitr)
library(kableExtra)
library(caret)
library(xgboost)
library(pROC)
library(e1071)
library(rpart)
library(rpart.plot)
library(randomForest)

Introduction

Our research focuses on exploring the features of Google Play Store apps that influence app success.

“Which are the top 5 app categories, as identified by classification models (logistic regression, SVM, XGBoost, KNN, and random forest), that significantly influenced app success (measured by installs) based on app data from 2010 to 2018, and how accurately can these models predict success trends within this time period?”

Loading the Dataset

Here, we have loaded the dataset ‘Google Play Store Apps’, stored in a CSV file, using `read.csv()`.

# Loading the Dataset
# read.csv() already returns a data.frame, so wrapping it in data.frame() is redundant.
data_apps <- read.csv("googleplaystore.csv")

Description of the App Dataset Columns

  1. App: The name of the application, represented as a character string.
  2. Category: The main category of the app, such as “ART_AND_DESIGN,” represented as a character string.
  3. Rating: The average user rating of the app, recorded as a numeric value.
  4. Reviews: The total number of user reviews for the app, shown as a character string.
  5. Size: The size of the application, represented as a character string.
  6. Installs: The approximate number of installations for the app, stored as a character string.
  7. Type: Indicates whether the app is free or paid, represented as a character string.
  8. Price: The price of the app, stored as a character string. Free apps are listed as “0,” while paid apps have a dollar amount.
  9. Content.Rating: The target age group for the app, represented as a character string.
  10. Genres: The genre(s) of the app.
  11. Last.Updated: The date of the app’s last update, stored as a character string.
  12. Current.Ver: The current version of the app, represented as a character string.
  13. Android.Ver: The minimum Android version required to run the app, stored as a character string.

Summary of dataset after Data Cleaning

# Dropping columns that are not used in the analysis
data_apps <- data_apps[, !(colnames(data_apps) %in% c("Android.Ver", "Current.Ver", "Type", "App", "Genres"))]

##Price----

### Conversion of Price to numeric
# Strip dollar signs and coerce; entries that cannot be parsed become NA.
data_apps$Price <- as.numeric(gsub("\\$", "", data_apps$Price))
# Drop rows whose Price could not be parsed. The original also compared the
# (now numeric) column against "" and kept two unused flag vectors
# (missing_na, missing_blank); both were dead code and are removed.
data_apps <- data_apps[!is.na(data_apps$Price), ]

## Size---

#### Replacing Missing values with the mean (Size)
# Mark sizes reported as "Varies with device" as missing. NOTE(review): the
# match is case-sensitive and uses a lowercase "device"; confirm no rows use
# other capitalizations.
data_apps$Size[data_apps$Size == "Varies with device"] <- NA # "Varies with device" to NA
# Drop rows whose Size string still contains a "+" (cannot be parsed below)
data_apps <- data_apps[!grepl("\\+", data_apps$Size), ]
# Normalize all sizes to megabytes: "k"-suffixed values are kilobytes
# (divide by 1000), otherwise strip the "M" suffix and keep the number.
data_apps$Size <- ifelse(grepl("k", data_apps$Size),
                          as.numeric(gsub("k", "", data_apps$Size)) *
0.001,  # Convert "k" (kilobytes) to MB
                          as.numeric(gsub("M", "", data_apps$Size))) # Remove "M" for megabytes

# Mean size per app Category (the original comment said 'Type', but the
# grouping variable is Category), ignoring the NAs introduced above
mean_size_by_type <- tapply(data_apps$Size, data_apps$Category,
mean, na.rm = TRUE)

# Impute missing sizes with the mean size of the app's Category (vectorized
# ifelse, indexing the per-category means by Category name)
data_apps$Size <- ifelse(is.na(data_apps$Size),  # Check if Size is NA
  round(mean_size_by_type[data_apps$Category], 1), # Replace with the mean size based on the Category
  data_apps$Size)  # Keep the original size if it's not NA


##Installs---

####Remove the '+' sign, Remove the commas, Convert to numeric
#clean installations
# Clean raw install counts: strip the trailing "+" and thousands separators,
# then coerce the result to numeric. Fully vectorized.
clean_installs <- function(Installs) {
  cleaned <- gsub(",", "", gsub("\\+", "", Installs))
  as.numeric(cleaned)
}

# gsub() and as.numeric() are vectorized, so clean_installs() can process the
# whole column in one call; the previous sapply() invoked it once per element.
data_apps$Installs <- clean_installs(data_apps$Installs)

# Flag whether either numeric column contains NaN after cleaning.
# vapply() pins the return type to logical(1) per column, unlike sapply().
nan_rows <- vapply(data_apps[, c("Size", "Installs")],
                   function(x) any(is.nan(x)),
                   logical(1))

## Rating ---
# Impute missing ratings with the overall mean rating.
data_apps <- data_apps %>%
  mutate(Rating = replace(Rating, is.na(Rating), mean(Rating, na.rm = TRUE)))

# Identify the unique values in the 'Installs' column
# NOTE(review): Installs was already converted to numeric above, so these
# values no longer contain "+" or "," characters.
unique_values <- unique(data_apps$Installs)

# Function to convert the installs to numeric
# NOTE(review): the 10^(comma_count - 1) multiplier looks suspect — commas are
# already stripped by the gsub(), so also scaling by the comma count would
# double-count magnitude for a raw string like "1,000,000". For input without
# commas, gregexpr() returns a single -1 "match" (length 1), making the
# multiplier 10^0 = 1, so the result is unaffected for this (already numeric)
# data. Verify before reusing on raw strings.
convert_to_numeric <- function(x) {
  # Remove non-numeric characters and convert to numeric
  as.numeric(gsub("[^0-9]", "", x)) * 10^(length(gregexpr(",", x)[[1]]) - 1)
}
# Sort unique values based on the custom numeric conversion
sorted_values <- unique_values[order(sapply(unique_values, convert_to_numeric))]

#Reviews---
# Reviews was read in as a character column; coerce it to numeric
# (unparseable entries would become NA with a warning).
data_apps$Reviews <- as.numeric(data_apps$Reviews)
# NOTE(review): the original repeated the Rating NA-imputation here under a
# mislabeled comment; Rating was already imputed above, so the repeat was a
# no-op and has been removed.

#Content rating---
# Encode Content.Rating as integer codes via its (alphabetically ordered)
# factor levels — collapses the original factor-then-numeric two-step.
data_apps$Content.Rating <- as.numeric(as.factor(data_apps$Content.Rating))
#### Preprocessing for a model
#categories----
# One-hot encode Category (the "- 1" removes the intercept so every level
# gets its own indicator column), and shorten the column prefix to "cat".
category_dummies <- model.matrix(~ Category - 1, data = data_apps)
colnames(category_dummies) <- gsub("Category", "cat", colnames(category_dummies))

# Append the indicator columns and drop the original Category column.
data_apps <- cbind(data_apps, category_dummies)
data_apps$Category <- NULL

# Make all column names syntactic by replacing spaces with underscores.
colnames(data_apps) <- gsub(" ", "_", colnames(data_apps))


#### Installs----

# Binarize Installs at its median: at or below the median -> "Low Installs",
# strictly above -> "High Installs"; then encode as a 0/1 factor
# (0 = Low Installs, 1 = High Installs).
median_installs <- median(data_apps$Installs, na.rm = TRUE)

data_apps$Installs_Category <- factor(
  ifelse(data_apps$Installs <= median_installs, "Low Installs", "High Installs"),
  levels = c("Low Installs", "High Installs"),
  labels = c(0, 1)
)

# Check the conversion
table(data_apps$Installs_Category)
## 
##    0    1 
## 5890 4950
# Create a histogram for the new categories
ggplot(data_apps, aes(x = Installs_Category)) +
  geom_bar(stat = "count", fill = "skyblue", color = "black") +
  labs(title = "Histogram of Installs Category (Low vs High)",
       x = "Installs Category",
       y = "Count") +
  theme_minimal()

## Last updated----
# Parse the 'Last.Updated' strings ("Month day, Year") into Date objects
data_apps$Last.Updated <- as.Date(data_apps$Last.Updated, format = "%B %d, %Y")

# Re-express each date as the number of days before the most recent update in
# the data (so 0 = most recently updated app; larger = staler)
data_apps$Last.Updated <- as.numeric(difftime(max(data_apps$Last.Updated, na.rm = TRUE), 
                                       data_apps$Last.Updated, 
                                       units = "days"))
# Render the column-wise summary as a scrollable striped kable table
summary(data_apps) %>%
  kable(caption = "Summary of Google Play Store Apps") %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE) %>%
  scroll_box(width = "100%", height = "400px")
Summary of Google Play Store Apps
Rating Reviews Size Installs Price Content.Rating Last.Updated catART_AND_DESIGN catAUTO_AND_VEHICLES catBEAUTY catBOOKS_AND_REFERENCE catBUSINESS catCOMICS catCOMMUNICATION catDATING catEDUCATION catENTERTAINMENT catEVENTS catFAMILY catFINANCE catFOOD_AND_DRINK catGAME catHEALTH_AND_FITNESS catHOUSE_AND_HOME catLIBRARIES_AND_DEMO catLIFESTYLE catMAPS_AND_NAVIGATION catMEDICAL catNEWS_AND_MAGAZINES catPARENTING catPERSONALIZATION catPHOTOGRAPHY catPRODUCTIVITY catSHOPPING catSOCIAL catSPORTS catTOOLS catTRAVEL_AND_LOCAL catVIDEO_PLAYERS catWEATHER Installs_Category
Min. :1.000 Min. : 0 Min. : 0.0085 Min. :0.000e+00 Min. : 0.000 Min. :1.000 Min. : 0.0 Min. :0.000000 Min. :0.000000 Min. :0.000000 Min. :0.00000 Min. :0.00000 Min. :0.000000 Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.000000 Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.00000 Min. :0.000000 Min. :0.000000 Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.000000 Min. :0.00000 Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.00000 Min. :0.000000 0:5890
1st Qu.:4.100 1st Qu.: 38 1st Qu.: 5.9000 1st Qu.:1.000e+03 1st Qu.: 0.000 1st Qu.:2.000 1st Qu.: 19.0 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.000000 1:4950
Median :4.200 Median : 2094 Median : 14.5000 Median :1.000e+05 Median : 0.000 Median :2.000 Median : 76.0 Median :0.000000 Median :0.000000 Median :0.000000 Median :0.00000 Median :0.00000 Median :0.000000 Median :0.0000 Median :0.00000 Median :0.00000 Median :0.00000 Median :0.000000 Median :0.0000 Median :0.00000 Median :0.00000 Median :0.0000 Median :0.00000 Median :0.000000 Median :0.000000 Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000 Median :0.000000 Median :0.00000 Median :0.0000 Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000 Median :0.0000 Median :0.00000 Median :0.000000 NA
Mean :4.192 Mean : 444153 Mean : 21.2119 Mean :1.546e+07 Mean : 1.027 Mean :2.465 Mean : 259.7 Mean :0.005996 Mean :0.007841 Mean :0.004889 Mean :0.02131 Mean :0.04244 Mean :0.005535 Mean :0.0357 Mean :0.02159 Mean :0.01439 Mean :0.01375 Mean :0.005904 Mean :0.1819 Mean :0.03376 Mean :0.01172 Mean :0.1055 Mean :0.03146 Mean :0.008118 Mean :0.007841 Mean :0.03524 Mean :0.01264 Mean :0.04271 Mean :0.02611 Mean :0.005535 Mean :0.03616 Mean :0.0309 Mean :0.03911 Mean :0.02399 Mean :0.02721 Mean :0.03542 Mean :0.07777 Mean :0.0238 Mean :0.01614 Mean :0.007565 NA
3rd Qu.:4.500 3rd Qu.: 54776 3rd Qu.: 28.0000 3rd Qu.:5.000e+06 3rd Qu.: 0.000 3rd Qu.:2.000 3rd Qu.: 322.0 3rd Qu.:0.000000 3rd Qu.:0.000000 3rd Qu.:0.000000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.:0.000000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.000000 NA
Max. :5.000 Max. :78158306 Max. :100.0000 Max. :1.000e+09 Max. :400.000 Max. :6.000 Max. :3001.0 Max. :1.000000 Max. :1.000000 Max. :1.000000 Max. :1.00000 Max. :1.00000 Max. :1.000000 Max. :1.0000 Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.000000 Max. :1.0000 Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :1.00000 Max. :1.000000 Max. :1.000000 Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.000000 Max. :1.00000 Max. :1.0000 Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :1.00000 Max. :1.000000 NA

Logistic Regression

KNN

Splitting the Data into Train and Test

# Assemble the feature set: the six base predictors plus every one-hot
# category indicator (columns prefixed "cat").
features <- c(
  "Rating", "Reviews", "Size", "Price", "Content.Rating", "Last.Updated",
  grep("^cat", names(data_apps), value = TRUE)
)
X <- data_apps[, features]
y <- data_apps$Installs_Category

# Hold out 30% of the rows for testing (createDataPartition stratifies on y).
set.seed(123)
train_index <- createDataPartition(y, p = 0.7, list = FALSE)
X_train <- X[train_index, ]
X_test  <- X[-train_index, ]
y_train <- y[train_index]
y_test  <- y[-train_index]

# Standardize features using statistics estimated on the training set only,
# then apply the same transform to the test set (no leakage).
preprocess_params <- preProcess(X_train, method = c("center", "scale"))
X_train_scaled <- predict(preprocess_params, X_train)
X_test_scaled  <- predict(preprocess_params, X_test)

Find optimal K using Cross validation

# Train KNN with 5-fold cross-validation for each candidate k.
# The trainControl specification does not depend on k, so it is built once
# outside the loop instead of being recreated on every iteration.
k_values <- seq(1, 20, by = 2)
train_control <- trainControl(method = "cv", number = 5)
cv_results <- lapply(k_values, function(k) {
  knn_model <- train(x = X_train_scaled,
                     y = y_train,
                     method = "knn",
                     tuneGrid = data.frame(k = k),   # evaluate exactly this k
                     trControl = train_control)
  knn_model$results$Accuracy
})

# Plot k values vs cross-validated accuracy
k_accuracy_df <- data.frame(k = k_values, accuracy = unlist(cv_results))
ggplot(k_accuracy_df, aes(x = k, y = accuracy)) +
  geom_line(color = "blue") +
  geom_point(color = "red") +
  labs(title = "K Values vs Cross-Validation Accuracy",
       x = "Number of Neighbors (k)",
       y = "Accuracy") +
  theme_minimal()

As we can see, the optimal k is found using cross-validation with accuracy as the metric; the highest accuracy achieved is about 78%.

Evaluation metrics for test data using optimal k

# which.max() requires an atomic vector; cv_results is a list produced by
# lapply(), so unlist it first (which.max() on a list raises an error).
optimal_k <- k_values[which.max(unlist(cv_results))]
final_knn <- knn3(X_train_scaled, y_train, k = optimal_k)

# Predict class-1 probabilities and threshold at 0.5 for hard labels.
y_pred_prob <- predict(final_knn, X_test_scaled, type = "prob")
y_pred <- factor(ifelse(y_pred_prob[, 2] > 0.5, 1, 0), levels = c(0, 1))

# Numeric (0/1) view of the test labels for the ROC computation.
y_test_numeric <- as.numeric(as.character(y_test))

# Calculate performance metrics. y_test_numeric is already numeric, so the
# extra as.numeric() wrapper in the original was redundant.
conf_matrix <- confusionMatrix(y_pred, y_test)
roc_curve <- roc(y_test_numeric, y_pred_prob[, 2])
auc_score <- auc(roc_curve)

# Create performance summary table.
# Indexing with ["Accuracy"] etc. returns *named* values; the names carried
# through c() become the row names of the data frame in the rendered table.
performance_summary <- data.frame(
  Metric = c("Accuracy", "Precision", "Recall", "F1 Score", "AUC"),
  Value = c(
    conf_matrix$overall["Accuracy"],
    conf_matrix$byClass["Precision"],
    conf_matrix$byClass["Recall"],
    conf_matrix$byClass["F1"],
    auc_score
  )
)
# Display performance summary using kable
kable(performance_summary, caption = "KNN Model Performance Summary") %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
KNN Model Performance Summary
Metric Value
Accuracy Accuracy 0.7610701
Precision Precision 0.7857968
Recall Recall 0.7702320
F1 F1 Score 0.7779366
AUC 0.8324551
# Confusion-matrix heat map: each cell shows its share of all test cases.
conf_matrix_df <- as.data.frame(conf_matrix$table)
conf_matrix_df$Percentage <- 100 * conf_matrix_df$Freq / sum(conf_matrix_df$Freq)

ggplot(conf_matrix_df, aes(x = Prediction, y = Reference)) +
  geom_tile(aes(fill = Percentage), color = "white") +
  geom_text(aes(label = sprintf("%.1f%%\n(n=%d)", Percentage, Freq)),
            size = 4, color = "black") +
  scale_fill_gradient2(low = "white", high = "#4A90E2",
                       midpoint = min(conf_matrix_df$Percentage)) +
  labs(title = "Confusion Matrix Heatmap",
       subtitle = sprintf("Overall Accuracy: %.1f%%", conf_matrix$overall["Accuracy"] * 100)) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"),
        plot.subtitle = element_text(hjust = 0.5),
        axis.text = element_text(size = 10),
        legend.title = element_text(size = 10))

# ROC curve with chance diagonal and AUC annotation.
plot(roc_curve, col = "blue", lwd = 2, main = "ROC Curve for KNN Model")
abline(a = 0, b = 1, lty = 2, col = "gray")
legend("bottomright", legend = sprintf("AUC = %.3f", auc_score))

The AUC for the model is 0.83, which is good; however, the other performance metrics, such as accuracy and precision, fall short of the anticipated 80% mark.

Feature importance

# Permutation importance for one feature: shuffle that column in the (scaled)
# test set and return the resulting drop in AUC versus the unpermuted model.
# A larger drop means the model relies more on the feature.
calc_importance <- function(feature) {
  X_test_permuted <- X_test_scaled
  X_test_permuted[, feature] <- sample(X_test_scaled[, feature])
  pred_permuted <- predict(final_knn, X_test_permuted, type = "prob")
  # Use the same 0/1 numeric labels (y_test_numeric) as the main ROC
  # computation for consistency; as.numeric() on the factor itself yields
  # 1/2 codes, which happens to give the same AUC but is misleading.
  roc_permuted <- roc(y_test_numeric, pred_permuted[, 2])
  auc_score - auc(roc_permuted)
}

# Score every feature and sort by descending importance.
importance_scores <- sapply(features, calc_importance)
importance_df <- data.frame(
  Feature = features,
  Importance = importance_scores
)
importance_df <- importance_df[order(-importance_df$Importance), ]

# Plot feature importance. head() caps the plot at 39 features but also
# tolerates fewer (the previous hard-coded importance_df[1:39, ] would
# produce NA rows if the feature set ever shrank).
ggplot(head(importance_df, 39), aes(x = reorder(Feature, Importance), y = Importance)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Feature Importance (KNN Model)",
       x = "Feature",
       y = "Importance Score") +
  theme_minimal()

The top 5 features contributing to the model are Size, Rating, Last updated, Content Rating and Reviews.

#————————————————————————-

Decision Tree

Splitting the data into train and test split

# Drop the raw numeric Installs column so the tree cannot trivially recover
# the target from it.
data <- data_apps[, setdiff(colnames(data_apps), "Installs")]

# 80/20 train/test split, stratified on the target.
set.seed(123)  # Ensure reproducibility
trainIndex <- createDataPartition(data$Installs_Category, p = 0.8, list = FALSE)
trainData <- data[trainIndex, ]
testData <- data[-trainIndex, ]

# Fit a classification tree predicting Installs_Category from all remaining columns.
set.seed(42)
tree_model <- rpart(Installs_Category ~ ., data = trainData, method = "class")

# Visualize the fitted tree.
rpart.plot(tree_model, main = "Decision Tree for Predicting Installs Category")

# Hard-class predictions on both splits.
train_predictions <- predict(tree_model, trainData, type = "class")
test_predictions <- predict(tree_model, testData, type = "class")

# Accuracy = proportion of correctly classified rows.
train_accuracy <- mean(train_predictions == trainData$Installs_Category)
test_accuracy <- mean(test_predictions == testData$Installs_Category)

# Report both accuracies.
cat("Training Accuracy: ", train_accuracy, "\n")
## Training Accuracy:  0.9519142
cat("Test Accuracy: ", test_accuracy, "\n")
## Test Accuracy:  0.9488007

Why shift to Random Forest? High Dimensionality: With 41 variables, random forest handles many features better and can identify the most important ones.
Feature Importance: Random forest provides a ranking of feature importance, helping us understand which variables influence the Installs_Category.
Accuracy: Random forest generally has better predictive accuracy for larger and more complex datasets.

Random Forest

In this analysis, we employ a Random Forest model to predict the number of installs based on the top 5 app categories. The Random Forest algorithm is a robust ensemble learning method that builds multiple decision trees and combines their predictions to improve accuracy and reduce overfitting.

Building the Random Forest Classifier

# Fit a 500-tree random forest on the training split. importance = TRUE makes
# randomForest record permutation and Gini importance for later inspection.
set.seed(123)
rf_model <- randomForest(
  Installs_Category ~ .,
  data = trainData,
  ntree = 500,       # number of trees in the ensemble
  mtry = 4,          # predictors sampled at each split
  importance = TRUE  # record variable-importance measures
)

# Print the model summary
print(rf_model)
## 
## Call:
##  randomForest(formula = Installs_Category ~ ., data = trainData,      ntree = 500, mtry = 4, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 4.35%
## Confusion matrix:
##      0    1 class.error
## 0 4547  165  0.03501698
## 1  212 3748  0.05353535

Plotting the Random Forest Result

# Plot the error-rate traces of the Random Forest: plot.randomForest draws one
# line per column of rf_model$err.rate (OOB plus one per class level).
plot(rf_model, main = "Random Forest Model Performance")

# Label the traces after the actual err.rate columns: OOB, class "0",
# class "1". (The previous legend said "Class 1 Error"/"Class 2 Error",
# but the class levels here are 0 and 1.)
legend("topright",
       legend = c("OOB Error", "Class 0 Error", "Class 1 Error"),
       col = c("black", "red", "green"),
       lty = 1,
       cex = 0.8)

Testing the accuracy

# Predict labels for both splits with the fitted forest.
train_predictions <- predict(rf_model, trainData)
test_predictions <- predict(rf_model, testData)

# Confusion matrix and derived statistics on the training data.
train_cm <- confusionMatrix(train_predictions, trainData$Installs_Category)
print(train_cm)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 4617  122
##          1   95 3838
##                                           
##                Accuracy : 0.975           
##                  95% CI : (0.9715, 0.9782)
##     No Information Rate : 0.5434          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.9495          
##                                           
##  Mcnemar's Test P-Value : 0.07756         
##                                           
##             Sensitivity : 0.9798          
##             Specificity : 0.9692          
##          Pos Pred Value : 0.9743          
##          Neg Pred Value : 0.9758          
##              Prevalence : 0.5434          
##          Detection Rate : 0.5324          
##    Detection Prevalence : 0.5465          
##       Balanced Accuracy : 0.9745          
##                                           
##        'Positive' Class : 0               
## 
# Confusion Matrix for Testing Data (caret reports accuracy, kappa, and
# sensitivity/specificity with class "0" treated as the positive class)
test_cm <- confusionMatrix(test_predictions, testData$Installs_Category)
print(test_cm)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1132   53
##          1   46  937
##                                           
##                Accuracy : 0.9543          
##                  95% CI : (0.9447, 0.9627)
##     No Information Rate : 0.5434          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9079          
##                                           
##  Mcnemar's Test P-Value : 0.5465          
##                                           
##             Sensitivity : 0.9610          
##             Specificity : 0.9465          
##          Pos Pred Value : 0.9553          
##          Neg Pred Value : 0.9532          
##              Prevalence : 0.5434          
##          Detection Rate : 0.5221          
##    Detection Prevalence : 0.5466          
##       Balanced Accuracy : 0.9537          
##                                           
##        'Positive' Class : 0               
## 

Checking for training, testing and OOB datasets

# Performance on Training Data
train_predictions <- predict(rf_model, newdata = trainData)
train_conf_matrix <- table(train_predictions, trainData$Installs_Category)
train_accuracy <- sum(diag(train_conf_matrix)) / sum(train_conf_matrix)
cat("Training Accuracy: ", train_accuracy, "\n")
## Training Accuracy:  0.9749769
# Performance on Testing Data
test_predictions <- predict(rf_model, newdata = testData)
test_conf_matrix <- table(test_predictions, testData$Installs_Category)
test_accuracy <- sum(diag(test_conf_matrix)) / sum(test_conf_matrix)
cat("Testing Accuracy: ", test_accuracy, "\n")
## Testing Accuracy:  0.9543358
# OOB error of the fully grown forest: take the LAST row of err.rate rather
# than hard-coding row 500, so this keeps working if ntree is ever changed.
oob_error <- rf_model$err.rate[nrow(rf_model$err.rate), "OOB"]
oob_accuracy <- 1 - oob_error
cat("OOB Accuracy: ", oob_accuracy, "\n")
## OOB Accuracy:  0.9565268
# Compare training, testing, and out-of-bag accuracy side by side
comparison <- data.frame(
  Dataset = c("Training", "Testing", "OOB"),
  Accuracy = c(train_accuracy, test_accuracy, oob_accuracy)
)
print(comparison)
##    Dataset  Accuracy
## 1 Training 0.9749769
## 2  Testing 0.9543358
## 3      OOB 0.9565268

AUC/ROC

# pROC is already attached via library(pROC) at the top of the file. Avoid
# installing packages at run time (the original called install.packages()
# mid-analysis); just verify the namespace is available.
stopifnot(requireNamespace("pROC", quietly = TRUE))

# Get predicted probabilities for the positive class (class '1')
rf_prob <- predict(rf_model, testData, type = "prob")[, 2] # Probabilities for class '1'

# Compute ROC curve and AUC
roc_curve <- roc(testData$Installs_Category, rf_prob)

# Plot the ROC curve
plot(roc_curve, col = "blue", lwd = 2, main = "ROC Curve for Random Forest")
abline(a = 0, b = 1, lty = 2, col = "gray") # Diagonal reference line

# Display the AUC value
auc_value <- auc(roc_curve)
cat("AUC:", auc_value, "\n")
## AUC: 0.9881922

Feature Importance Values

# Variable importance: permutation-based MeanDecreaseAccuracy (per class and
# overall) plus MeanDecreaseGini, as recorded because the forest was fitted
# with importance = TRUE
importance(rf_model)
##                                 0            1 MeanDecreaseAccuracy
## Rating                 15.1142447  19.94723199           22.5997876
## Reviews                90.0067853 100.26035812          101.9665752
## Size                   11.8994820  17.27289181           20.2294097
## Price                  37.1396369  41.59484288           45.7099624
## Content.Rating         10.1721956  10.05626857           13.8741454
## Last.Updated           17.5212482  16.76819395           24.0306055
## catART_AND_DESIGN      -1.7119601  -0.45450740           -1.6004086
## catAUTO_AND_VEHICLES    2.1287644   7.30700185            7.0864851
## catBEAUTY              -1.6937161   4.43340758            2.3915006
## catBOOKS_AND_REFERENCE -1.4109002   2.40811737            0.9254431
## catBUSINESS            -2.9786640  11.46650484           10.2307574
## catCOMICS               2.6287264   2.40038960            3.7801295
## catCOMMUNICATION        0.3875751  -0.94118112           -0.3435919
## catDATING               4.8477990  -0.39074922            3.4658472
## catEDUCATION           12.0764045   7.85370158           12.9867373
## catENTERTAINMENT       14.4382698   1.32379128           14.7495594
## catEVENTS               6.6402383  10.65844758           11.2020113
## catFAMILY              -5.9215812   6.69886354            2.4785553
## catFINANCE              0.8756648   5.90079226            5.1381823
## catFOOD_AND_DRINK      -4.4615018   0.43503127           -3.1082568
## catGAME                12.3678276   0.23835409           12.5083958
## catHEALTH_AND_FITNESS   1.4680166   3.83750346            3.7565491
## catHOUSE_AND_HOME      -1.2091642   2.71761141            1.1205213
## catLIBRARIES_AND_DEMO   1.0719952   7.41387595            7.0258040
## catLIFESTYLE           -1.7178046   6.10759331            5.2129387
## catMAPS_AND_NAVIGATION -3.7395597   1.04149969           -1.2763125
## catMEDICAL              6.6464141  20.04470310           20.3154344
## catNEWS_AND_MAGAZINES   1.5084049   6.83054946            6.2625470
## catPARENTING           -1.5158336   4.68584942            2.3270685
## catPERSONALIZATION     -0.1349033   3.41111809            2.6745840
## catPHOTOGRAPHY         14.3055236   5.00877316           15.4154186
## catPRODUCTIVITY        -4.6589723   1.97159358           -1.0439910
## catSHOPPING             7.9015188   1.67789040            8.5681436
## catSOCIAL               3.0757349  -0.93364078            2.3564318
## catSPORTS               1.6490382   0.37976680            1.3831812
## catTOOLS               -3.8679728   0.19000491           -1.6527781
## catTRAVEL_AND_LOCAL    -2.8476202   0.61812514           -1.7685359
## catVIDEO_PLAYERS       -0.7382980  -0.01421662           -0.5999692
## catWEATHER              3.2343068  -2.54911142            1.2265776
##                        MeanDecreaseGini
## Rating                       258.205076
## Reviews                     2341.076881
## Size                         202.836810
## Price                        105.230783
## Content.Rating                32.068778
## Last.Updated                 276.565693
## catART_AND_DESIGN              2.632787
## catAUTO_AND_VEHICLES           4.148712
## catBEAUTY                      2.755840
## catBOOKS_AND_REFERENCE         4.978577
## catBUSINESS                   12.245818
## catCOMICS                      2.994492
## catCOMMUNICATION               4.271465
## catDATING                      5.816032
## catEDUCATION                   9.973986
## catENTERTAINMENT              13.860234
## catEVENTS                      4.870598
## catFAMILY                     13.905954
## catFINANCE                     5.811543
## catFOOD_AND_DRINK              2.055011
## catGAME                       40.202925
## catHEALTH_AND_FITNESS          5.479474
## catHOUSE_AND_HOME              2.958173
## catLIBRARIES_AND_DEMO          4.038455
## catLIFESTYLE                   6.265143
## catMAPS_AND_NAVIGATION         1.791088
## catMEDICAL                    43.762990
## catNEWS_AND_MAGAZINES          4.494912
## catPARENTING                   2.847382
## catPERSONALIZATION             4.814103
## catPHOTOGRAPHY                18.729702
## catPRODUCTIVITY                4.511257
## catSHOPPING                    7.480957
## catSOCIAL                      4.087498
## catSPORTS                      3.295354
## catTOOLS                       7.597817
## catTRAVEL_AND_LOCAL            2.957056
## catVIDEO_PLAYERS               3.473253
## catWEATHER                     3.001699
# Dot-chart of both importance measures (MeanDecreaseAccuracy, MeanDecreaseGini)
varImpPlot(rf_model)

#### Visualization for Feature Importance

# Pull the importance matrix into a tidy data frame with one row per feature.
imp_matrix <- importance(rf_model)
importance_df <- data.frame(
  Feature = rownames(imp_matrix),
  MeanDecreaseAccuracy = imp_matrix[, "MeanDecreaseAccuracy"],
  MeanDecreaseGini = imp_matrix[, "MeanDecreaseGini"]
)

# Horizontal bar chart of permutation importance (mean decrease in accuracy).
accuracy_plot <- ggplot(importance_df,
                        aes(x = reorder(Feature, MeanDecreaseAccuracy),
                            y = MeanDecreaseAccuracy)) +
  geom_col(fill = "skyblue") +
  coord_flip() +
  labs(
    title = "Feature Importance (Mean Decrease in Accuracy)",
    x = "Features",
    y = "Importance"
  ) +
  theme_minimal() +
  theme(text = element_text(size = 12), axis.text.y = element_text(size = 10))

# Render the accuracy-based importance chart.
print(accuracy_plot)

# Save the plot with larger dimensions
# ggsave("feature_importance_accuracy_large.png", plot = accuracy_plot, width = 12, height = 10, dpi = 300)

# Horizontal bar chart of split-quality importance (mean decrease in Gini).
gini_plot <- ggplot(importance_df,
                    aes(x = reorder(Feature, MeanDecreaseGini),
                        y = MeanDecreaseGini)) +
  geom_col(fill = "lightgreen") +
  coord_flip() +
  labs(
    title = "Feature Importance (Mean Decrease in Gini)",
    x = "Features",
    y = "Importance"
  ) +
  theme_minimal() +
  theme(text = element_text(size = 12), axis.text.y = element_text(size = 10))

# Render the Gini-based importance chart.
print(gini_plot)

# Save the plot with larger dimensions
# ggsave("feature_importance_gini_large.png", plot = gini_plot, width = 12, height = 10, dpi = 300)

Gradient Boosting

#  Data Splitting -----------------------------------------------------

# Separate features (X) and target (y): drop both the raw Installs count and
# the derived Installs_Category so neither leaks into the feature matrix
#X <- data_final %>% select(-Installs, -Success)
X <- data_apps %>% select(-Installs, -Installs_Category)  # Exclude the target variable
y <- data_apps$Installs_Category             # Extract the target variable


table(y)
## y
##    0    1 
## 5890 4950
# Stratified 70/30 split into training and testing sets
set.seed(123)
train_index <- createDataPartition(y, p = 0.7, list = FALSE)

# Define X_train, X_test, y_train, y_test; everything is coerced to numeric
# because xgb.DMatrix requires a numeric matrix
X_train <- X[train_index, ] %>% mutate(across(everything(), as.numeric))
X_test <- X[-train_index, ] %>% mutate(across(everything(), as.numeric))
y_train <- as.numeric(as.character(y[train_index]))  # Convert to numeric
y_test <- as.numeric(as.character(y[-train_index]))  # Convert to numeric
# Convert data to XGBoost's native matrix format
dtrain <- xgb.DMatrix(data = as.matrix(X_train), label = y_train)
dtest <- xgb.DMatrix(data = as.matrix(X_test), label = y_test)

# NOTE(review): Installs_Category was already created as a 0/1 factor above,
# so this re-factoring appears to be a no-op — confirm before removing
data_apps$Installs_Category <- factor(data_apps$Installs_Category, levels = c(0, 1))
# 4. Train Gradient Boosting Model --------------------------------------

params <- list(
  objective = "binary:logistic",  # Binary classification
  eval_metric = "logloss",
  max_depth = 6,
  eta = 0.1,
  subsample = 0.8,
  colsample_bytree = 0.8
)

# Train the model
set.seed(42)
xgb_model <- xgb.train(
  params = params,
  data = dtrain,
  nrounds = 100,
  watchlist = list(train = dtrain, test = dtest),
  early_stopping_rounds = 10,
  verbose = 1
)
## [1]  train-logloss:0.609051  test-logloss:0.610845 
## Multiple eval metrics are present. Will use test_logloss for early stopping.
## Will train until test_logloss hasn't improved in 10 rounds.
## 
## [2]  train-logloss:0.540069  test-logloss:0.543209 
## [3]  train-logloss:0.482265  test-logloss:0.486837 
## [4]  train-logloss:0.434053  test-logloss:0.440368 
## [5]  train-logloss:0.392603  test-logloss:0.399988 
## [6]  train-logloss:0.357633  test-logloss:0.366472 
## [7]  train-logloss:0.326262  test-logloss:0.335497 
## [8]  train-logloss:0.299307  test-logloss:0.309752 
## [9]  train-logloss:0.275127  test-logloss:0.286471 
## [10] train-logloss:0.254064  test-logloss:0.266038 
## [11] train-logloss:0.246028  test-logloss:0.258059 
## [12] train-logloss:0.228166  test-logloss:0.241110 
## [13] train-logloss:0.212535  test-logloss:0.226162 
## [14] train-logloss:0.199030  test-logloss:0.213673 
## [15] train-logloss:0.186369  test-logloss:0.202031 
## [16] train-logloss:0.175329  test-logloss:0.191756 
## [17] train-logloss:0.165163  test-logloss:0.182054 
## [18] train-logloss:0.155909  test-logloss:0.173148 
## [19] train-logloss:0.147937  test-logloss:0.165724 
## [20] train-logloss:0.140798  test-logloss:0.159106 
## [21] train-logloss:0.134419  test-logloss:0.153387 
## [22] train-logloss:0.128485  test-logloss:0.147681 
## [23] train-logloss:0.126779  test-logloss:0.146159 
## [24] train-logloss:0.121702  test-logloss:0.141489 
## [25] train-logloss:0.120300  test-logloss:0.140252 
## [26] train-logloss:0.115684  test-logloss:0.136209 
## [27] train-logloss:0.111563  test-logloss:0.132826 
## [28] train-logloss:0.107759  test-logloss:0.129496 
## [29] train-logloss:0.104358  test-logloss:0.126577 
## [30] train-logloss:0.101191  test-logloss:0.124088 
## [31] train-logloss:0.098429  test-logloss:0.121812 
## [32] train-logloss:0.096048  test-logloss:0.119849 
## [33] train-logloss:0.093736  test-logloss:0.118078 
## [34] train-logloss:0.091855  test-logloss:0.116222 
## [35] train-logloss:0.089978  test-logloss:0.114691 
## [36] train-logloss:0.088334  test-logloss:0.113315 
## [37] train-logloss:0.086624  test-logloss:0.111863 
## [38] train-logloss:0.085822  test-logloss:0.111504 
## [39] train-logloss:0.084222  test-logloss:0.110214 
## [40] train-logloss:0.083036  test-logloss:0.109247 
## [41] train-logloss:0.081971  test-logloss:0.108476 
## [42] train-logloss:0.080807  test-logloss:0.107580 
## [43] train-logloss:0.079924  test-logloss:0.106761 
## [44] train-logloss:0.078889  test-logloss:0.106185 
## [45] train-logloss:0.077890  test-logloss:0.105556 
## [46] train-logloss:0.077019  test-logloss:0.104972 
## [47] train-logloss:0.076692  test-logloss:0.104888 
## [48] train-logloss:0.076087  test-logloss:0.104560 
## [49] train-logloss:0.075228  test-logloss:0.103823 
## [50] train-logloss:0.074563  test-logloss:0.103500 
## [51] train-logloss:0.073904  test-logloss:0.103389 
## [52] train-logloss:0.073342  test-logloss:0.103047 
## [53] train-logloss:0.072766  test-logloss:0.102632 
## [54] train-logloss:0.072190  test-logloss:0.102346 
## [55] train-logloss:0.071809  test-logloss:0.102039 
## [56] train-logloss:0.071340  test-logloss:0.101941 
## [57] train-logloss:0.070922  test-logloss:0.101906 
## [58] train-logloss:0.070537  test-logloss:0.101871 
## [59] train-logloss:0.070109  test-logloss:0.101728 
## [60] train-logloss:0.069795  test-logloss:0.101409 
## [61] train-logloss:0.069435  test-logloss:0.101432 
## [62] train-logloss:0.069234  test-logloss:0.101196 
## [63] train-logloss:0.068961  test-logloss:0.100983 
## [64] train-logloss:0.068728  test-logloss:0.101066 
## [65] train-logloss:0.068271  test-logloss:0.101246 
## [66] train-logloss:0.067911  test-logloss:0.101329 
## [67] train-logloss:0.067658  test-logloss:0.101267 
## [68] train-logloss:0.067465  test-logloss:0.101118 
## [69] train-logloss:0.066696  test-logloss:0.101152 
## [70] train-logloss:0.066292  test-logloss:0.101252 
## [71] train-logloss:0.066050  test-logloss:0.101360 
## [72] train-logloss:0.065654  test-logloss:0.101406 
## [73] train-logloss:0.065511  test-logloss:0.101405 
## Stopping. Best iteration:
## [63] train-logloss:0.068961  test-logloss:0.100983
# 5. Model Evaluation ---------------------------------------------------

# Score the held-out set: XGBoost returns P(class == 1), which is
# thresholded at 0.5 to obtain hard labels.
y_pred <- predict(xgb_model, dtest)
y_pred_class <- factor(as.numeric(y_pred > 0.5), levels = c(0, 1))

# Align the true labels with the same factor levels, then tabulate.
y_test <- factor(y_test, levels = c(0, 1))
conf_matrix <- confusionMatrix(y_pred_class, y_test)
print(conf_matrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1698   67
##          1   69 1418
##                                           
##                Accuracy : 0.9582          
##                  95% CI : (0.9507, 0.9648)
##     No Information Rate : 0.5434          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9157          
##                                           
##  Mcnemar's Test P-Value : 0.9317          
##                                           
##             Sensitivity : 0.9610          
##             Specificity : 0.9549          
##          Pos Pred Value : 0.9620          
##          Neg Pred Value : 0.9536          
##              Prevalence : 0.5434          
##          Detection Rate : 0.5221          
##    Detection Prevalence : 0.5427          
##       Balanced Accuracy : 0.9579          
##                                           
##        'Positive' Class : 0               
## 

Confusion Matrix:

# AUC and ROC Curve
# roc() needs numeric 0/1 truth labels, so the factor is converted back;
# y_pred holds the raw predicted probabilities, giving a threshold-free ROC.
roc_obj <- roc(as.numeric(as.character(y_test)), y_pred)
auc_value <- auc(roc_obj)
cat("AUC:", auc_value, "\n")
## AUC: 0.993891
# Plot ROC Curve with the chance diagonal for reference
plot(roc_obj, main = "ROC Curve", col = "blue", lwd = 2)
abline(a = 0, b = 1, lty = 2, col = "red")

# 6. Feature Importance -------------------------------------------------

# Importance from the fitted booster; only the top 10 features are drawn.
importance_matrix <- xgb.importance(feature_names = colnames(X_train), model = xgb_model)
xgb.plot.importance(importance_matrix, top_n = 10, main = "Feature Importance")

# 7. Save Model ---------------------------------------------------------

# Persist the booster to disk so it can be reloaded without retraining.
xgb.save(xgb_model, "xgb_app_success.model")
## [1] TRUE
# Summary
cat("Gradient Boosting achieved an accuracy of", conf_matrix$overall["Accuracy"],
    "and AUC of", auc_value, "\n")
## Gradient Boosting achieved an accuracy of 0.9581796 and AUC of 0.993891

SVM

# SVM data preparation: the target becomes a factor and both install
# columns are dropped from the feature set so they cannot leak the label.
y <- as.factor(data_apps$Installs_Category)
X <- data_apps[, !names(data_apps) %in% c('Installs_Category', 'Installs')]

# Hold out 25% of the rows, stratified on the target.
set.seed(42)
trainIndex <- createDataPartition(y, p = 0.75, list = FALSE)
X_train <- X[trainIndex, , drop = FALSE]
y_train <- y[trainIndex]
X_test <- X[-trainIndex, , drop = FALSE]
y_test <- y[-trainIndex]

# Min-max scale every feature to [0, 1]; the scaler is fitted on the
# training split only and then applied to both splits.
preprocess_params <- preProcess(X_train, method = "range")  # Min-Max Normalization
X_train <- predict(preprocess_params, X_train)
X_test <- predict(preprocess_params, X_test)

# Inspect the normalized training split
summary(X_train)
##      Rating          Reviews               Size             Price         
##  Min.   :0.0000   Min.   :0.0000000   Min.   :0.00000   Min.   :0.000000  
##  1st Qu.:0.7750   1st Qu.:0.0000005   1st Qu.:0.05892   1st Qu.:0.000000  
##  Median :0.8000   Median :0.0000268   Median :0.14793   Median :0.000000  
##  Mean   :0.7987   Mean   :0.0056732   Mean   :0.21192   Mean   :0.002586  
##  3rd Qu.:0.8750   3rd Qu.:0.0006895   3rd Qu.:0.27994   3rd Qu.:0.000000  
##  Max.   :1.0000   Max.   :1.0000000   Max.   :1.00000   Max.   :1.000000  
##  Content.Rating    Last.Updated      catART_AND_DESIGN  catAUTO_AND_VEHICLES
##  Min.   :0.0000   Min.   :0.000000   Min.   :0.000000   Min.   :0.000000    
##  1st Qu.:0.2500   1st Qu.:0.007032   1st Qu.:0.000000   1st Qu.:0.000000    
##  Median :0.2500   Median :0.028127   Median :0.000000   Median :0.000000    
##  Mean   :0.3645   Mean   :0.095668   Mean   :0.006149   Mean   :0.007871    
##  3rd Qu.:0.2500   3rd Qu.:0.118801   3rd Qu.:0.000000   3rd Qu.:0.000000    
##  Max.   :1.0000   Max.   :1.000000   Max.   :1.000000   Max.   :1.000000    
##    catBEAUTY        catBOOKS_AND_REFERENCE  catBUSINESS       catCOMICS       
##  Min.   :0.000000   Min.   :0.00000        Min.   :0.0000   Min.   :0.000000  
##  1st Qu.:0.000000   1st Qu.:0.00000        1st Qu.:0.0000   1st Qu.:0.000000  
##  Median :0.000000   Median :0.00000        Median :0.0000   Median :0.000000  
##  Mean   :0.004797   Mean   :0.02214        Mean   :0.0428   Mean   :0.005657  
##  3rd Qu.:0.000000   3rd Qu.:0.00000        3rd Qu.:0.0000   3rd Qu.:0.000000  
##  Max.   :1.000000   Max.   :1.00000        Max.   :1.0000   Max.   :1.000000  
##  catCOMMUNICATION    catDATING        catEDUCATION     catENTERTAINMENT 
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.00000   Median :0.00000   Median :0.00000  
##  Mean   :0.03628   Mean   :0.02054   Mean   :0.01402   Mean   :0.01439  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.00000   Max.   :1.00000  
##    catEVENTS          catFAMILY        catFINANCE      catFOOD_AND_DRINK
##  Min.   :0.000000   Min.   :0.0000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.000000   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.000000   Median :0.0000   Median :0.00000   Median :0.00000  
##  Mean   :0.006149   Mean   :0.1792   Mean   :0.03394   Mean   :0.01205  
##  3rd Qu.:0.000000   3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.000000   Max.   :1.0000   Max.   :1.00000   Max.   :1.00000  
##     catGAME       catHEALTH_AND_FITNESS catHOUSE_AND_HOME 
##  Min.   :0.0000   Min.   :0.00000       Min.   :0.000000  
##  1st Qu.:0.0000   1st Qu.:0.00000       1st Qu.:0.000000  
##  Median :0.0000   Median :0.00000       Median :0.000000  
##  Mean   :0.1028   Mean   :0.03247       Mean   :0.008609  
##  3rd Qu.:0.0000   3rd Qu.:0.00000       3rd Qu.:0.000000  
##  Max.   :1.0000   Max.   :1.00000       Max.   :1.000000  
##  catLIBRARIES_AND_DEMO  catLIFESTYLE     catMAPS_AND_NAVIGATION
##  Min.   :0.000000      Min.   :0.00000   Min.   :0.00000       
##  1st Qu.:0.000000      1st Qu.:0.00000   1st Qu.:0.00000       
##  Median :0.000000      Median :0.00000   Median :0.00000       
##  Mean   :0.007994      Mean   :0.03345   Mean   :0.01267       
##  3rd Qu.:0.000000      3rd Qu.:0.00000   3rd Qu.:0.00000       
##  Max.   :1.000000      Max.   :1.00000   Max.   :1.00000       
##    catMEDICAL      catNEWS_AND_MAGAZINES  catPARENTING     catPERSONALIZATION
##  Min.   :0.00000   Min.   :0.00000       Min.   :0.00000   Min.   :0.00000   
##  1st Qu.:0.00000   1st Qu.:0.00000       1st Qu.:0.00000   1st Qu.:0.00000   
##  Median :0.00000   Median :0.00000       Median :0.00000   Median :0.00000   
##  Mean   :0.04341   Mean   :0.02743       Mean   :0.00578   Mean   :0.03493   
##  3rd Qu.:0.00000   3rd Qu.:0.00000       3rd Qu.:0.00000   3rd Qu.:0.00000   
##  Max.   :1.00000   Max.   :1.00000       Max.   :1.00000   Max.   :1.00000   
##  catPHOTOGRAPHY   catPRODUCTIVITY    catSHOPPING        catSOCIAL      
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.0000   Median :0.00000   Median :0.00000   Median :0.00000  
##  Mean   :0.0305   Mean   :0.04071   Mean   :0.02398   Mean   :0.02656  
##  3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.00000   Max.   :1.00000  
##    catSPORTS          catTOOLS       catTRAVEL_AND_LOCAL catVIDEO_PLAYERS 
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000     Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000     1st Qu.:0.00000  
##  Median :0.00000   Median :0.00000   Median :0.00000     Median :0.00000  
##  Mean   :0.03665   Mean   :0.07699   Mean   :0.02558     Mean   :0.01623  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000     3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.00000     Max.   :1.00000  
##    catWEATHER      
##  Min.   :0.000000  
##  1st Qu.:0.000000  
##  Median :0.000000  
##  Mean   :0.007256  
##  3rd Qu.:0.000000  
##  Max.   :1.000000
# Inspect the test split: values slightly outside [0, 1] are expected,
# since the scaler was fitted on the training split only.
summary(X_test)
##      Rating          Reviews               Size              Price         
##  Min.   :0.0000   Min.   :0.0000000   Min.   :0.000085   Min.   :0.000000  
##  1st Qu.:0.7750   1st Qu.:0.0000005   1st Qu.:0.056920   1st Qu.:0.000000  
##  Median :0.8000   Median :0.0000269   Median :0.144927   Median :0.000000  
##  Mean   :0.7956   Mean   :0.0057113   Mean   :0.212437   Mean   :0.002517  
##  3rd Qu.:0.8750   3rd Qu.:0.0007190   3rd Qu.:0.279939   3rd Qu.:0.000000  
##  Max.   :1.0000   Max.   :0.8843502   Max.   :1.000000   Max.   :1.000025  
##  Content.Rating    Last.Updated      catART_AND_DESIGN  catAUTO_AND_VEHICLES
##  Min.   :0.2500   Min.   :0.000000   Min.   :0.000000   Min.   :0.000000    
##  1st Qu.:0.2500   1st Qu.:0.007032   1st Qu.:0.000000   1st Qu.:0.000000    
##  Median :0.2500   Median :0.028127   Median :0.000000   Median :0.000000    
##  Mean   :0.3714   Mean   :0.097483   Mean   :0.005537   Mean   :0.007752    
##  3rd Qu.:0.2500   3rd Qu.:0.125463   3rd Qu.:0.000000   3rd Qu.:0.000000    
##  Max.   :1.2500   Max.   :1.110659   Max.   :1.000000   Max.   :1.000000    
##    catBEAUTY        catBOOKS_AND_REFERENCE  catBUSINESS        catCOMICS       
##  Min.   :0.000000   Min.   :0.00000        Min.   :0.00000   Min.   :0.000000  
##  1st Qu.:0.000000   1st Qu.:0.00000        1st Qu.:0.00000   1st Qu.:0.000000  
##  Median :0.000000   Median :0.00000        Median :0.00000   Median :0.000000  
##  Mean   :0.005168   Mean   :0.01883        Mean   :0.04134   Mean   :0.005168  
##  3rd Qu.:0.000000   3rd Qu.:0.00000        3rd Qu.:0.00000   3rd Qu.:0.000000  
##  Max.   :1.000000   Max.   :1.00000        Max.   :1.00000   Max.   :1.000000  
##  catCOMMUNICATION    catDATING        catEDUCATION    catENTERTAINMENT 
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.00000   Median :0.0000   Median :0.00000  
##  Mean   :0.03396   Mean   :0.02473   Mean   :0.0155   Mean   :0.01181  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.0000   Max.   :1.00000  
##    catEVENTS          catFAMILY        catFINANCE      catFOOD_AND_DRINK
##  Min.   :0.000000   Min.   :0.0000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.000000   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.000000   Median :0.0000   Median :0.00000   Median :0.00000  
##  Mean   :0.005168   Mean   :0.1901   Mean   :0.03322   Mean   :0.01071  
##  3rd Qu.:0.000000   3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.000000   Max.   :1.0000   Max.   :1.00000   Max.   :1.00000  
##     catGAME       catHEALTH_AND_FITNESS catHOUSE_AND_HOME 
##  Min.   :0.0000   Min.   :0.00000       Min.   :0.000000  
##  1st Qu.:0.0000   1st Qu.:0.00000       1st Qu.:0.000000  
##  Median :0.0000   Median :0.00000       Median :0.000000  
##  Mean   :0.1137   Mean   :0.02842       Mean   :0.006645  
##  3rd Qu.:0.0000   3rd Qu.:0.00000       3rd Qu.:0.000000  
##  Max.   :1.0000   Max.   :1.00000       Max.   :1.000000  
##  catLIBRARIES_AND_DEMO  catLIFESTYLE     catMAPS_AND_NAVIGATION
##  Min.   :0.000000      Min.   :0.00000   Min.   :0.00000       
##  1st Qu.:0.000000      1st Qu.:0.00000   1st Qu.:0.00000       
##  Median :0.000000      Median :0.00000   Median :0.00000       
##  Mean   :0.007383      Mean   :0.04061   Mean   :0.01255       
##  3rd Qu.:0.000000      3rd Qu.:0.00000   3rd Qu.:0.00000       
##  Max.   :1.000000      Max.   :1.00000   Max.   :1.00000       
##    catMEDICAL      catNEWS_AND_MAGAZINES  catPARENTING      catPERSONALIZATION
##  Min.   :0.00000   Min.   :0.00000       Min.   :0.000000   Min.   :0.00000   
##  1st Qu.:0.00000   1st Qu.:0.00000       1st Qu.:0.000000   1st Qu.:0.00000   
##  Median :0.00000   Median :0.00000       Median :0.000000   Median :0.00000   
##  Mean   :0.04061   Mean   :0.02215       Mean   :0.004799   Mean   :0.03987   
##  3rd Qu.:0.00000   3rd Qu.:0.00000       3rd Qu.:0.000000   3rd Qu.:0.00000   
##  Max.   :1.00000   Max.   :1.00000       Max.   :1.000000   Max.   :1.00000   
##  catPHOTOGRAPHY    catPRODUCTIVITY    catSHOPPING        catSOCIAL      
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.00000   Median :0.00000   Median :0.00000  
##  Mean   :0.03212   Mean   :0.03433   Mean   :0.02399   Mean   :0.02916  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.00000   Max.   :1.00000  
##    catSPORTS          catTOOLS      catTRAVEL_AND_LOCAL catVIDEO_PLAYERS 
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.00000     Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.00000     1st Qu.:0.00000  
##  Median :0.00000   Median :0.0000   Median :0.00000     Median :0.00000  
##  Mean   :0.03175   Mean   :0.0801   Mean   :0.01846     Mean   :0.01587  
##  3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.00000     3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.00000     Max.   :1.00000  
##    catWEATHER     
##  Min.   :0.00000  
##  1st Qu.:0.00000  
##  Median :0.00000  
##  Mean   :0.00849  
##  3rd Qu.:0.00000  
##  Max.   :1.00000

Checking if the boundary is non-linear or linear

library(plotly)

# Assemble the plotting frame: normalized features plus the class label.
scatter_df <- data.frame(X_train, Class = as.factor(y_train))
axis_titles <- colnames(X_train)

# 3D scatter of the first three features, colored by class, to eyeball
# whether the two classes look linearly separable.
plot_1_2_3 <- plot_ly(data = scatter_df,
                      x = ~X_train[, 1],
                      y = ~X_train[, 2],
                      z = ~X_train[, 3],
                      color = ~Class,
                      colors = c("red", "blue"),  # class 0 = red, class 1 = blue
                      type = 'scatter3d',
                      mode = 'markers') %>%
  layout(title = "3D Scatter Plot: Feature 1 vs Feature 2 vs Feature 3",
         scene = list(xaxis = list(title = axis_titles[1]),
                      yaxis = list(title = axis_titles[2]),
                      zaxis = list(title = axis_titles[3])))

# Same view for features 4-6.
plot_4_5_6 <- plot_ly(data = scatter_df,
                      x = ~X_train[, 4],
                      y = ~X_train[, 5],
                      z = ~X_train[, 6],
                      color = ~Class,
                      colors = c("red", "blue"),  # class 0 = red, class 1 = blue
                      type = 'scatter3d',
                      mode = 'markers') %>%
  layout(title = "3D Scatter Plot: Feature 4 vs Feature 5 vs Feature 6",
         scene = list(xaxis = list(title = axis_titles[4]),
                      yaxis = list(title = axis_titles[5]),
                      zaxis = list(title = axis_titles[6])))

# Render both plots
plot_1_2_3
plot_4_5_6

As we can see, we cannot decide whether the boundary is linear or non-linear; hence, let us build two models — a linear SVM and a non-linear SVM — to check which one is a better fit.

Tuning to find the best parameter values for C and Gamma for SVM non-linear

# Load necessary libraries
library(e1071)
library(caret)

# Non-linear (RBF) SVM: grid-search C and sigma with 5-fold CV.
# X_train, y_train, X_test, y_test come from the SVM data-prep chunk above.

# Combine the training data into a data frame
train_data <- as.data.frame(cbind(X_train, y_train))

# Set up k-fold cross-validation
set.seed(42)
train_control <- trainControl(method = "cv", number = 5)

# Define the tuning grid for 'C' and 'sigma'.  caret's "svmRadial" uses
# kernlab's parameterization, where sigma plays the role of gamma.
tune_grid <- expand.grid(C = c( 0.1, 1, 10, 100),
                          sigma = c(0.5, 1))

# Train the SVM model using the radial kernel with the tuning grid.
# (`scaled` is forwarded to kernlab::ksvm, whose argument is indeed
# spelled `scaled`.)
svm_model <- train(y_train ~ ., data = train_data,
                   method = "svmRadial",
                   tuneGrid = tune_grid,
                   trControl = train_control,scaled = TRUE)

# Print the results of the tuning
print(svm_model)
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 8131 samples
##   39 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 6504, 6505, 6505, 6506, 6504 
## Resampling results across tuning parameters:
## 
##   C      sigma  Accuracy   Kappa    
##     0.1  0.5    0.7152887  0.4135606
##     0.1  1.0    0.7103702  0.3989988
##     1.0  0.5    0.7653439  0.5245875
##     1.0  1.0    0.7691565  0.5316017
##    10.0  0.5    0.7864964  0.5679965
##    10.0  1.0    0.7775180  0.5503263
##   100.0  0.5    0.7925222  0.5808094
##   100.0  1.0    0.7809624  0.5579036
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.5 and C = 100.
# Best model parameters (highest CV accuracy)
best_params <- svm_model$bestTune
cat("Best Parameters:\n")
## Best Parameters:
print(best_params)
##   sigma   C
## 7   0.5 100

As seen on the training set, the best cross-validated accuracy is achieved when C = 100 and sigma (gamma) = 0.5.

Tuning to find the best parameter values for C for SVM linear

# Load necessary libraries
library(e1071)
library(caret)

# Linear SVM: grid-search the cost C with 5-fold CV, for comparison
# against the RBF model above.

# Combine the training data into a data frame
train_data <- as.data.frame(cbind(X_train, y_train))

# Set up k-fold cross-validation
set.seed(42)
train_control <- trainControl(method = "cv", number = 5)

# Define the tuning grid for 'C' (a linear kernel has no sigma/gamma)
tune_grid <- expand.grid(C = c( 0.1, 1, 10, 100, 1000))

# Train the SVM model using a linear kernel with the tuning grid
svm_model <- train(y_train ~ ., data = train_data,
                   method = "svmLinear",
                   tuneGrid = tune_grid,
                   trControl = train_control,scaled = TRUE)

# Print the results of the tuning
print(svm_model)
## Support Vector Machines with Linear Kernel 
## 
## 8131 samples
##   39 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 6504, 6505, 6505, 6506, 6504 
## Resampling results across tuning parameters:
## 
##   C      Accuracy   Kappa    
##   1e-01  0.7422242  0.4695042
##   1e+00  0.7841613  0.5544567
##   1e+01  0.8463913  0.6827210
##   1e+02  0.9105898  0.8172768
##   1e+03  0.9477331  0.8939740
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was C = 1000.
# Best model parameters (highest CV accuracy)
best_params <- svm_model$bestTune
cat("Best Parameters:\n")
## Best Parameters:
print(best_params)
##      C
## 5 1000

For the linear model, it can be seen that at C = 1000 we attain an accuracy of about 95 percent, which suggests that it is reasonable to assume the classes are linearly separable. Hence, let us now find the accuracy, ROC curve, and AUC score on the test data.

Bias, variance, and MSE calculation

# Load necessary libraries
library(e1071)
library(ggplot2)
library(reshape2)

# Bias / variance / MSE sweep over the SVM cost parameter C: for each C
# a linear-kernel SVM is refit on the training split and scored on the
# held-out test split.

# Combine training and test sets into data frames
train_data <- as.data.frame(cbind(X_train, y_train))
test_data <- as.data.frame(X_test)

# Candidate cost values; result vectors are preallocated instead of
# being grown with c() inside the loop (avoids repeated copying).
C_values <- c(0.1, 1, 10, 100, 1000, 100000, 500000)
bias_values <- numeric(length(C_values))
variance_values <- numeric(length(C_values))
mse_values <- numeric(length(C_values))

# Loop invariants hoisted: the true labels and their mean do not change
# across the sweep.
y_test_numeric <- as.numeric(as.character(y_test))
mean_true <- mean(y_test_numeric)

# Loop through the candidate values of C
for (i in seq_along(C_values)) {
  # Train the SVM model at this cost
  svm_model <- svm(y_train ~ ., data = train_data, kernel = "linear",
                   cost = C_values[i], scale = TRUE)

  # Predict on the test set and coerce the factor labels back to 0/1
  predictions_numeric <- as.numeric(as.character(predict(svm_model, newdata = test_data)))

  # Bias: squared difference between the true mean and the predicted mean
  bias_values[i] <- (mean_true - mean(predictions_numeric))^2

  # Variance: variability of the predictions themselves
  variance_values[i] <- var(predictions_numeric)

  # MSE: average squared error (equals the misclassification rate here,
  # since both vectors are 0/1)
  mse_values[i] <- mean((y_test_numeric - predictions_numeric)^2)
}

# Combine results into a data frame for plotting
results <- data.frame(
  C = C_values,
  Bias = bias_values,
  Variance = variance_values,
  MSE = mse_values
)

# Reshape to long format for ggplot
results_melted <- melt(results, id.vars = "C", variable.name = "Metric", value.name = "Value")

# Plot all three metrics against C
ggplot(results_melted, aes(x = factor(C), y = Value, color = Metric, group = Metric)) +
  geom_line(size = 1) +
  geom_point(size = 3) +
  labs(title = "Bias-Variance Tradeoff and MSE for Different C Values",
       x = "C Values",
       y = "Value",
       color = "Metric") +
  theme_minimal(base_size = 15) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    legend.position = "top"
  ) +
  scale_color_manual(values = c("Bias" = "blue", "Variance" = "red", "MSE" = "green")) +
  scale_x_discrete(labels = function(x) paste0("C = ", x)) +
  geom_text(aes(label = round(Value, 2)), vjust = -0.5)  # Add value labels above points

The analysis of bias, variance, and mean squared error (MSE) across different values of C shows: MSE dip: at C = 1000, the MSE is at its minimum. Low bias and variance: the variance is a little higher, which is expected for large C, but it is not much higher than at C = 10 or C = 100.

Hence, better to select C = 1000 for the SVM model to achieve an optimal balance between bias, variance, and MSE.

Model Evaluation for Test Data

# Load necessary libraries
library(e1071)
library(pROC)  # For ROC and AUC

# Final linear-kernel SVM at the cost selected by the sweep above
# (C = 1000), evaluated on the held-out test split.

# Combine the training data into a data frame
train_data <- as.data.frame(cbind(X_train, y_train))

# Fit the SVM model with a linear kernel.
# FIX: e1071::svm's argument is `scale`, not `scaled`; the old spelling
# fell into `...` and was silently ignored.  Results are unchanged
# because scale = TRUE is also the default.
svm_model <- svm(y_train ~ ., data = train_data, kernel = "linear", cost = 1000, decision.values = TRUE, scale = TRUE)

# Step 1: Make predictions on the test set
predictions <- predict(svm_model, newdata = as.data.frame(X_test))

# Step 2: Create confusion matrix
confusion_matrix <- table(Predicted = predictions, Actual = y_test)
cat("Confusion Matrix:\n")
## Confusion Matrix:
print(confusion_matrix)
##          Actual
## Predicted    0    1
##         0 1435  118
##         1   37 1119
confusion_matrix_caret <- confusionMatrix(confusion_matrix)

# Step 3: Extract Precision, Recall, and F1 Score (caret treats the
# first level, class "0", as the positive class by default)
precision <- confusion_matrix_caret$byClass['Precision']
recall <- confusion_matrix_caret$byClass['Recall']
f1_score <- confusion_matrix_caret$byClass['F1']

# Display the metrics
cat("Precision:", precision, "\n")
## Precision: 0.924018
cat("Recall:", recall, "\n")
## Recall: 0.9748641
cat("F1 Score:", f1_score, "\n")
## F1 Score: 0.9487603
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
cat("Accuracy:", accuracy, "\n")
## Accuracy: 0.9427833
# Step 4: Get decision values for the ROC curve
fitted <- attributes(predict(svm_model, newdata = as.data.frame(X_test), decision.values = TRUE))$decision.values

# Step 5: Generate ROC plot for the test set
roc_curve <- roc(y_test, -fitted)  # Note: Use negative for class labeling

# Plot the ROC curve
plot(roc_curve, main = "ROC Curve for Test Data")
# Add AUC to the plot
auc_value <- auc(roc_curve)
legend("bottomright", legend = paste("AUC =", round(auc_value, 2)), bty = "n")

The model achieved an impressive accuracy of 94% and a high AUC of 0.99, indicating excellent performance in classification. Additionally, the high precision, recall, and F1-score values reflect the model’s capability to accurately classify both class 0 and class 1. These results suggest that the model is highly effective in distinguishing between the two classes, making it reliable for practical applications.

Feature importance for the model

# Assuming you have already trained your SVM model (svm_model) using e1071

# Get coefficients from the SVM model
coefficients <- as.vector(svm_model$coefs) %*% svm_model$SV

# Get the intercept term
intercept <- svm_model$rho

# Combine coefficients and intercept into a single vector
all_coefficients <- c(intercept, coefficients)

# Print coefficients
cat("Coefficients (including intercept):\n")
## Coefficients (including intercept):
print(all_coefficients)
##  [1]  8.247485e+01 -2.487294e-02 -5.763023e+02  8.432842e-03  2.044164e+01
##  [6]  3.659358e-02  3.208561e-02 -1.820126e-10  2.885656e-10 -3.780087e-10
## [11]  2.099227e-10 -1.011346e-02  1.125534e-02 -1.948308e-10 -1.516675e-03
## [16] -2.227913e-02 -2.293632e-11  4.443089e-02 -3.298720e-02  3.647356e-02
## [21]  2.196903e-02 -3.623512e-02  6.645266e-03 -1.157776e-02  2.202244e-02
## [26]  1.582016e-02  2.841151e-02  6.831574e-02  3.257688e-02  2.763410e-03
## [31] -1.678191e-02 -1.424158e-02 -5.788365e-10 -2.544958e-02  7.363344e-03
## [36]  2.185389e-02 -1.748155e-02 -7.793704e-03 -1.146568e-02 -1.004163e-02
# Check the number of coefficients
num_coefficients <- length(all_coefficients)
cat("Number of Coefficients (including intercept):", num_coefficients, "\n")
## Number of Coefficients (including intercept): 40
# Get feature names from the training design matrix so each SVM weight
# can be matched to the predictor it multiplies.
feature_names <- colnames(X_train)

# Attach feature names to the coefficient weights.
# NOTE(review): `coefficients` is a 1 x p matrix (result of %*%), so
# setNames() adds a names attribute but keeps the matrix dim — which is
# why the printed output below shows a matrix plus a names attribute.
# Wrapping in as.vector() first would yield a clean named vector; left
# unchanged here to match the knitted output.
named_coefficients <- setNames(coefficients, feature_names)

# Print named coefficients
cat("Feature Coefficients:\n")
## Feature Coefficients:
print(named_coefficients)
##           Rating   Reviews        Size    Price Content.Rating Last.Updated
## [1,] -0.02487294 -576.3023 0.008432842 20.44164     0.03659358   0.03208561
##      catART_AND_DESIGN catAUTO_AND_VEHICLES     catBEAUTY
## [1,]     -1.820126e-10         2.885656e-10 -3.780087e-10
##      catBOOKS_AND_REFERENCE catBUSINESS  catCOMICS catCOMMUNICATION
## [1,]           2.099227e-10 -0.01011346 0.01125534    -1.948308e-10
##         catDATING catEDUCATION catENTERTAINMENT  catEVENTS  catFAMILY
## [1,] -0.001516675  -0.02227913    -2.293632e-11 0.04443089 -0.0329872
##      catFINANCE catFOOD_AND_DRINK     catGAME catHEALTH_AND_FITNESS
## [1,] 0.03647356        0.02196903 -0.03623512           0.006645266
##      catHOUSE_AND_HOME catLIBRARIES_AND_DEMO catLIFESTYLE
## [1,]       -0.01157776            0.02202244   0.01582016
##      catMAPS_AND_NAVIGATION catMEDICAL catNEWS_AND_MAGAZINES catPARENTING
## [1,]             0.02841151 0.06831574            0.03257688   0.00276341
##      catPERSONALIZATION catPHOTOGRAPHY catPRODUCTIVITY catSHOPPING   catSOCIAL
## [1,]        -0.01678191    -0.01424158   -5.788365e-10 -0.02544958 0.007363344
##       catSPORTS    catTOOLS catTRAVEL_AND_LOCAL catVIDEO_PLAYERS  catWEATHER
## [1,] 0.02185389 -0.01748155        -0.007793704      -0.01146568 -0.01004163
## attr(,"names")
##  [1] "Rating"                 "Reviews"                "Size"                  
##  [4] "Price"                  "Content.Rating"         "Last.Updated"          
##  [7] "catART_AND_DESIGN"      "catAUTO_AND_VEHICLES"   "catBEAUTY"             
## [10] "catBOOKS_AND_REFERENCE" "catBUSINESS"            "catCOMICS"             
## [13] "catCOMMUNICATION"       "catDATING"              "catEDUCATION"          
## [16] "catENTERTAINMENT"       "catEVENTS"              "catFAMILY"             
## [19] "catFINANCE"             "catFOOD_AND_DRINK"      "catGAME"               
## [22] "catHEALTH_AND_FITNESS"  "catHOUSE_AND_HOME"      "catLIBRARIES_AND_DEMO" 
## [25] "catLIFESTYLE"           "catMAPS_AND_NAVIGATION" "catMEDICAL"            
## [28] "catNEWS_AND_MAGAZINES"  "catPARENTING"           "catPERSONALIZATION"    
## [31] "catPHOTOGRAPHY"         "catPRODUCTIVITY"        "catSHOPPING"           
## [34] "catSOCIAL"              "catSPORTS"              "catTOOLS"              
## [37] "catTRAVEL_AND_LOCAL"    "catVIDEO_PLAYERS"       "catWEATHER"
# Rank features by importance: take the absolute value of each weight
# (sign only indicates direction of association, not strength) and sort
# descending. sort() drops the matrix dim but preserves the names, so
# the result prints as a named numeric vector.
# NOTE(review): the ~1e-10 values at the tail are numerically zero —
# those category dummies carry essentially no weight in the model.
sorted_coefficients <- sort(abs(named_coefficients), decreasing = TRUE)

# Print sorted feature importance
cat("Sorted Feature Importance:\n")
## Sorted Feature Importance:
print(sorted_coefficients)
##                Reviews                  Price             catMEDICAL 
##           5.763023e+02           2.044164e+01           6.831574e-02 
##              catEVENTS         Content.Rating             catFINANCE 
##           4.443089e-02           3.659358e-02           3.647356e-02 
##                catGAME              catFAMILY  catNEWS_AND_MAGAZINES 
##           3.623512e-02           3.298720e-02           3.257688e-02 
##           Last.Updated catMAPS_AND_NAVIGATION            catSHOPPING 
##           3.208561e-02           2.841151e-02           2.544958e-02 
##                 Rating           catEDUCATION  catLIBRARIES_AND_DEMO 
##           2.487294e-02           2.227913e-02           2.202244e-02 
##      catFOOD_AND_DRINK              catSPORTS               catTOOLS 
##           2.196903e-02           2.185389e-02           1.748155e-02 
##     catPERSONALIZATION           catLIFESTYLE         catPHOTOGRAPHY 
##           1.678191e-02           1.582016e-02           1.424158e-02 
##      catHOUSE_AND_HOME       catVIDEO_PLAYERS              catCOMICS 
##           1.157776e-02           1.146568e-02           1.125534e-02 
##            catBUSINESS             catWEATHER                   Size 
##           1.011346e-02           1.004163e-02           8.432842e-03 
##    catTRAVEL_AND_LOCAL              catSOCIAL  catHEALTH_AND_FITNESS 
##           7.793704e-03           7.363344e-03           6.645266e-03 
##           catPARENTING              catDATING        catPRODUCTIVITY 
##           2.763410e-03           1.516675e-03           5.788365e-10 
##              catBEAUTY   catAUTO_AND_VEHICLES catBOOKS_AND_REFERENCE 
##           3.780087e-10           2.885656e-10           2.099227e-10 
##       catCOMMUNICATION      catART_AND_DESIGN       catENTERTAINMENT 
##           1.948308e-10           1.820126e-10           2.293632e-11
# Keep the 15 most influential features (by absolute coefficient value).
top_coef <- head(sorted_coefficients, 15)

# Visualize feature importance as a horizontal bar chart.
# With horiz = TRUE the feature NAMES run along the y-axis, so the x-axis
# label must describe the coefficient magnitude (the original labeled the
# x-axis "Features", which is the wrong axis when horiz = TRUE).
barplot(
  rev(top_coef),                    # rev() so the largest bar sits at the top
  main = "Feature Importance from SVM Coefficients",
  xlab = "Absolute Coefficient Magnitude",
  col = "steelblue",
  las = 2,                          # axis labels perpendicular to the axis
  cex.names = 0.5,                  # shrink names so all 15 labels fit
  horiz = TRUE
)

The analysis shows that Reviews and Price are by far the strongest features influencing the model, with absolute coefficients several orders of magnitude larger than the rest; among the remaining numeric predictors, Content.Rating, Last.Updated, and Rating also rank highly. The app categories with the largest absolute coefficients are Medical, Events, Finance, Game, and Family, suggesting that these categories are the ones most strongly associated with app success in the SVM model.